In [1]:
    
import tensorflow as tf
import pandas as pd
import numpy as np
import shutil
import math
import multiprocessing
from datetime import datetime
# private feature_column module, used below for isinstance checks on column types
from tensorflow.python.feature_column import feature_column
print(tf.__version__)
    
    
    
In [2]:
    
MODEL_NAME = 'reg-model-01'
TRAIN_DATA_FILE = 'data/train-data.csv'
VALID_DATA_FILE = 'data/valid-data.csv'
TEST_DATA_FILE = 'data/test-data.csv'
RESUME_TRAINING = False
PROCESS_FEATURES = True
MULTI_THREADING = False
    
In [3]:
    
HEADER = ['key','x','y','alpha','beta','target']
HEADER_DEFAULTS = [[0], [0.0], [0.0], ['NA'], ['NA'], [0.0]]
NUMERIC_FEATURE_NAMES = ['x', 'y']  
CATEGORICAL_FEATURE_NAMES_WITH_VOCABULARY = {'alpha':['ax01', 'ax02'], 'beta':['bx01', 'bx02']}
CATEGORICAL_FEATURE_NAMES = list(CATEGORICAL_FEATURE_NAMES_WITH_VOCABULARY.keys())
FEATURE_NAMES = NUMERIC_FEATURE_NAMES + CATEGORICAL_FEATURE_NAMES
TARGET_NAME = 'target'
UNUSED_FEATURE_NAMES = list(set(HEADER) - set(FEATURE_NAMES) - {TARGET_NAME})
print("Header: {}".format(HEADER))
print("Numeric Features: {}".format(NUMERIC_FEATURE_NAMES))
print("Categorical Features: {}".format(CATEGORICAL_FEATURE_NAMES))
print("Target: {}".format(TARGET_NAME))
print("Unused Features: {}".format(UNUSED_FEATURE_NAMES))
    
    
In [4]:
    
def process_dataframe(dataset_df):
    # Engineered features: squares, the interaction term, and
    # dist_xy = sqrt((x - y)^2), i.e. the absolute distance |x - y|.
    dataset_df["x_2"] = np.square(dataset_df['x'])
    dataset_df["y_2"] = np.square(dataset_df['y'])
    dataset_df["xy"] = dataset_df['x'] * dataset_df['y']
    dataset_df['dist_xy'] = np.sqrt(np.square(dataset_df['x'] - dataset_df['y']))

    return dataset_df

def generate_pandas_input_fn(file_name, mode=tf.estimator.ModeKeys.EVAL,
                             skip_header_lines=0,
                             num_epochs=1,
                             batch_size=100):
    df_dataset = pd.read_csv(file_name, names=HEADER, skiprows=skip_header_lines)
    
    x = df_dataset[FEATURE_NAMES].copy()
    if PROCESS_FEATURES:
        x = process_dataframe(x)
    
    y = df_dataset[TARGET_NAME]
        
    shuffle = (mode == tf.estimator.ModeKeys.TRAIN)
    
    num_threads = 1

    if MULTI_THREADING:
        # Split the epochs across reader threads so the total number of
        # examples enqueued stays roughly the same.
        num_threads = multiprocessing.cpu_count()
        num_epochs = int(num_epochs/num_threads) if mode == tf.estimator.ModeKeys.TRAIN else num_epochs
    
    pandas_input_fn = tf.estimator.inputs.pandas_input_fn(
        batch_size=batch_size,
        num_epochs=num_epochs,
        shuffle=shuffle,
        x=x,
        y=y,
        num_threads=num_threads,
        target_column=TARGET_NAME
    )
    
    print("")
    print("* data input_fn:")
    print("================")
    print("Input file: {}".format(file_name))
    print("Dataset size: {}".format(len(df_dataset)))
    print("Batch size: {}".format(batch_size))
    print("Epoch Count: {}".format(num_epochs))
    print("Mode: {}".format(mode))
    print("Thread Count: {}".format(num_threads))
    print("Shuffle: {}".format(shuffle))
    print("================")
    print("")
    
    return pandas_input_fn
    
In [5]:
    
features, target = generate_pandas_input_fn(file_name=TRAIN_DATA_FILE)()
print("Feature read from DataFrame: {}".format(list(features.keys())))
print("Target read from DataFrame: {}".format(target))
    
    
In [6]:
    
def get_feature_columns():

    CONSTRUCTED_NUMERIC_FEATURES_NAMES = ['x_2', 'y_2', 'xy', 'dist_xy']

    # Copy the list so repeated calls don't keep appending the
    # constructed feature names to the global NUMERIC_FEATURE_NAMES.
    all_numeric_feature_names = list(NUMERIC_FEATURE_NAMES)

    if PROCESS_FEATURES:
        all_numeric_feature_names += CONSTRUCTED_NUMERIC_FEATURES_NAMES

    numeric_columns = {feature_name: tf.feature_column.numeric_column(feature_name)
                       for feature_name in all_numeric_feature_names}

    categorical_column_with_vocabulary = \
        {name: tf.feature_column.categorical_column_with_vocabulary_list(name, vocabulary)
         for name, vocabulary in CATEGORICAL_FEATURE_NAMES_WITH_VOCABULARY.items()}
        
    feature_columns = {}
    feature_columns.update(numeric_columns)
    feature_columns.update(categorical_column_with_vocabulary)
        
    # add extended features (crossing, bucketization, embedding)

    feature_columns['alpha_X_beta'] = tf.feature_column.crossed_column(
        [feature_columns['alpha'], feature_columns['beta']], hash_bucket_size=4)
    
    return feature_columns
feature_columns = get_feature_columns()
print("Feature Columns: {}".format(feature_columns))
    
    
In [7]:
    
def create_estimator(run_config, hparams):
    
    feature_columns = list(get_feature_columns().values())
    
    dense_columns = list(
        filter(lambda column: isinstance(column, feature_column._NumericColumn),
               feature_columns)
    )

    categorical_columns = list(
        filter(lambda column: isinstance(column, feature_column._VocabularyListCategoricalColumn) or
                              isinstance(column, feature_column._BucketizedColumn),
               feature_columns)
    )

    # DNNs require dense inputs, so wrap each sparse categorical
    # column in a one-hot indicator column.
    indicator_columns = [tf.feature_column.indicator_column(column)
                         for column in categorical_columns]
    
    
    estimator_feature_columns = dense_columns + indicator_columns

    estimator = tf.estimator.DNNRegressor(
        feature_columns=estimator_feature_columns,
        hidden_units=hparams.hidden_units,
        optimizer=tf.train.AdamOptimizer(),
        activation_fn=tf.nn.elu,
        dropout=hparams.dropout_prob,
        config=run_config
    )
    
    print("")
    print("Estimator Type: {}".format(type(estimator)))
    print("")
    
    return estimator
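The categorical columns are wrapped in indicator (one-hot) columns because DNNRegressor only accepts dense inputs. With two vocabulary values per feature that is cheap; for large vocabularies, embedding columns are the usual alternative. A sketch, reusing the categorical_columns list built inside create_estimator (the dimension is an illustrative assumption):

# hypothetical alternative to indicator_columns for large vocabularies
embedding_columns = [tf.feature_column.embedding_column(column, dimension=4)
                     for column in categorical_columns]
estimator_feature_columns = dense_columns + embedding_columns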
    
In [8]:
    
hparams = tf.contrib.training.HParams(
    num_epochs=100,
    batch_size=500,
    hidden_units=[8, 4],
    dropout_prob=0.0)
model_dir = 'trained_models/{}'.format(MODEL_NAME)
run_config = tf.estimator.RunConfig().replace(model_dir=model_dir)
print("Model directory: {}".format(run_config.model_dir))
print("Hyper-parameters: {}".format(hparams))
    
    
In [9]:
    
estimator = create_estimator(run_config, hparams)
    
    
In [10]:
    
train_input_fn = generate_pandas_input_fn(file_name=TRAIN_DATA_FILE,
                                          mode=tf.estimator.ModeKeys.TRAIN,
                                          num_epochs=hparams.num_epochs,
                                          batch_size=hparams.batch_size)

if not RESUME_TRAINING:
    # start fresh: remove checkpoints/exports from previous runs
    shutil.rmtree(model_dir, ignore_errors=True)
    
tf.logging.set_verbosity(tf.logging.INFO)
time_start = datetime.utcnow() 
print("Estimator training started at {}".format(time_start.strftime("%H:%M:%S")))
print(".......................................")
estimator.train(input_fn=train_input_fn)
time_end = datetime.utcnow() 
print(".......................................")
print("Estimator training finished at {}".format(time_end.strftime("%H:%M:%S")))
print("")
time_elapsed = time_end - time_start
print("Estimator training elapsed time: {} seconds".format(time_elapsed.total_seconds()))
    
    
In [11]:
    
TEST_SIZE = 5000
test_input_fn = generate_pandas_input_fn(file_name=TEST_DATA_FILE,
                                         mode=tf.estimator.ModeKeys.EVAL,
                                         batch_size=TEST_SIZE)
results = estimator.evaluate(input_fn=test_input_fn)
print("")
print(results)
rmse = round(math.sqrt(results["average_loss"]), 5)
print("")
print("RMSE: {}".format(rmse))
    
    
In [12]:
    
import itertools

predict_input_fn = generate_pandas_input_fn(file_name=TEST_DATA_FILE,
                                            mode=tf.estimator.ModeKeys.PREDICT,
                                            batch_size=5)
predictions = estimator.predict(input_fn=predict_input_fn)
values = list(map(lambda item: item["predictions"][0],
                  itertools.islice(predictions, 5)))
print()
print("Predicted Values: {}".format(values))
    
    
In [1]:
    
def process_features(features):
    # TensorFlow mirror of process_dataframe, applied to the
    # serving-time feature tensors.
    features["x_2"] = tf.square(features['x'])
    features["y_2"] = tf.square(features['y'])
    features["xy"] = tf.multiply(features['x'], features['y'])
    features['dist_xy'] = tf.sqrt(tf.squared_difference(features['x'], features['y']))

    return features

def csv_serving_input_fn():
    # Parses raw CSV rows (features only: no key, no target) into the
    # feature dict the exported model expects.
    SERVING_HEADER = ['x','y','alpha','beta']
    SERVING_HEADER_DEFAULTS = [[0.0], [0.0], ['NA'], ['NA']]
    rows_string_tensor = tf.placeholder(dtype=tf.string,
                                        shape=[None],
                                        name='csv_rows')
    
    receiver_tensor = {'csv_rows': rows_string_tensor}
    row_columns = tf.expand_dims(rows_string_tensor, -1)
    columns = tf.decode_csv(row_columns, record_defaults=SERVING_HEADER_DEFAULTS)
    features = dict(zip(SERVING_HEADER, columns))
    
    if PROCESS_FEATURES:
        features = process_features(features)
    return tf.estimator.export.ServingInputReceiver(
        features, receiver_tensor)
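The CSV receiver above suits string-based clients. A sketch of an alternative receiver that accepts raw tensors instead (e.g. for JSON requests to TensorFlow Serving); the placeholder names mirror SERVING_HEADER and are assumptions consistent with this notebook:

def json_serving_input_fn():
    receiver_tensor = {
        'x': tf.placeholder(tf.float32, [None]),
        'y': tf.placeholder(tf.float32, [None]),
        'alpha': tf.placeholder(tf.string, [None]),
        'beta': tf.placeholder(tf.string, [None]),
    }
    features = dict(receiver_tensor)
    if PROCESS_FEATURES:
        features = process_features(features)
    return tf.estimator.export.ServingInputReceiver(features, receiver_tensor)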
    
In [31]:
    
export_dir = model_dir + "/export"
estimator.export_savedmodel(
    export_dir_base=export_dir,
    serving_input_receiver_fn=csv_serving_input_fn,
    as_text=True
)
    
    
In [35]:
    
import os

# Export folders are named by Unix timestamp; sort to pick the latest
# (os.listdir returns entries in arbitrary order).
saved_model_dir = os.path.join(export_dir, sorted(os.listdir(export_dir))[-1])
print(saved_model_dir)

predictor_fn = tf.contrib.predictor.from_saved_model(
    export_dir=saved_model_dir,
    signature_def_key="predict"
)
output = predictor_fn({'csv_rows': ["0.5,1,ax01,bx02", "-0.5,-1,ax02,bx02"]})
print(output)
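The exported signatures can also be inspected from the shell with TensorFlow's saved_model_cli tool (here via the notebook's ! syntax, which interpolates the Python variable):

!saved_model_cli show --dir {saved_model_dir} --all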
    
    